# pip install folium
# conda install gensim
import numpy as np
import pandas as pd
from IPython.display import Image
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm
import scipy.stats as stats
import folium
import seaborn as sns
from datetime import datetime
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.tokenize import WhitespaceTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
# Load the hotel review dataset (UTF-8 CSV) and take a first look at it.
df = pd.read_csv('data/Hotel_Reviews.csv', encoding = 'utf-8')
# dataset dimensions (rows, columns)
print(df.shape)
# first five rows (only displays inside a notebook cell)
df.head()
# per-column missing-value counts
print(df.isnull().sum())
# column dtypes and memory usage
print(df.info())
def autolabel(rects, ax=None):
    """Attach a text label above each bar in *rects*, displaying its height.

    Arguments
        rects: bar container returned by ``Axes.bar``.
        ax: axes to annotate; defaults to the current axes. The original
            relied on a global ``ax`` existing at call time, which made the
            helper fail with ``NameError`` outside ``bar_chart``.
    Returns
        None; annotations are added to the axes as a side effect.
    """
    if ax is None:
        ax = plt.gca()
    for rect in rects:
        height = rect.get_height()
        ax.annotate('{}'.format(height),
                    xy=(rect.get_x() + rect.get_width() / 2, height),
                    xytext=(0, 3),  # 3 points vertical offset
                    textcoords="offset points",
                    ha='center', va='bottom')
def bar_chart(labels, freq, size, title, ylabel, angel):
    """Render a labelled bar chart for a frequency series.

    Arguments
        labels: x-axis labels (pandas Series / array-like with .tolist()).
        freq: y-axis values, same length as labels.
        size: matplotlib figure-size tuple.
        title: figure title.
        ylabel: y-axis caption, also used as the legend entry.
        angel: rotation angle (degrees) for the x tick labels.
    Returns
        None; displays the chart.
    """
    names = labels.tolist()
    values = freq.tolist()
    positions = np.arange(len(names))  # one slot per label
    bar_width = 0.5
    fig, ax = plt.subplots(figsize=size)
    rects = ax.bar(positions, values, bar_width, label=ylabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(names, rotation=angel)
    ax.legend()
    # autolabel(rects)
    fig.tight_layout()
    plt.show()
def frequency(col, data=None):
    """Count occurrences of each value in a column, most frequent first.

    Arguments
        col: column name to group by.
        data: DataFrame to operate on; defaults to the module-level ``df``
            (backward compatible with the original single-argument calls).
    Returns
        A new DataFrame with columns [col, 'count'], sorted by 'count'
        descending.
    """
    if data is None:
        data = df
    # agg(['count']) replaces the original set form agg({'count'}): sets are
    # unordered and the set form is deprecated in newer pandas.
    df_freq = (data.groupby(col)[col]
               .agg(['count'])
               .reset_index()
               .sort_values(by='count', ascending=False))
    return df_freq
# Per-hotel review counts, most reviewed first.
hotel = frequency('Hotel_Name')
hotel.to_csv('data/hotel.csv', index = False)
most_popular = hotel['Hotel_Name'].iloc[0]
least_popular = hotel['Hotel_Name'].iloc[-1]
print("The most popular hotel is: " + most_popular + '.')
print("The most unpopular hotel is: " + least_popular + '.')
# only show top 20 hotels in this bar chart
hotel = hotel[:20]
bar_chart(hotel['Hotel_Name'], hotel['count'], (16,12), 'Freq', 'Frequency of Hotel', 90)
Based on the bar chart, the Britannia International Hotel Canary Wharf, Strand Palace Hotel, and Park Plaza Westminster Bridge London are the three hotels travelers stay at most frequently.
# One (lat, lng) row per hotel, for the maps below.
# .copy() is required: the original mutated a column slice of df in place,
# which triggers SettingWithCopyWarning and may silently not modify anything.
location = df[['Hotel_Name', 'lat', 'lng']].copy()
# drop hotels without coordinates
location.dropna(axis = 0, how = 'any', inplace = True)
# keep the first occurrence of each hotel
location.drop_duplicates('Hotel_Name', keep = 'first', inplace = True)
location.reset_index(drop = True, inplace = True)
location.to_csv('data/location.csv', index = False)
location.head()
# Centre the overview map on Switzerland, roughly the middle of Europe.
latitude = 47.22
longitude = 8.32
from folium import plugins
# Fresh map of Europe.
hotel_map = folium.Map(location = [latitude, longitude], zoom_start = 5)
# Cluster layer that groups nearby hotel markers together.
incidents = plugins.MarkerCluster().add_to(hotel_map)
# One marker per hotel, attached to the cluster.
for hotel_lat, hotel_lng, hotel_name in zip(location['lat'], location['lng'], location['Hotel_Name']):
    folium.Marker(
        location=[hotel_lat, hotel_lng],
        icon=None,
        popup=hotel_name,
    ).add_to(incidents)
# Render the clustered markers on the map.
hotel_map.add_child(incidents)
The hotels are mainly scattered across France and the United Kingdom. Other hotels are located in Spain, Italy, Austria, and the Netherlands. This shows that people frequently travel to these destinations.
# Zoom in on central London, where many of the hotels sit.
latitude = 51.513981
longitude = -0.133795
hotel_map = folium.Map(location=[latitude, longitude], zoom_start=12)
incidents = folium.map.FeatureGroup()
# A yellow circle for every hotel coordinate.
for point_lat, point_lng in zip(location['lat'], location['lng']):
    circle = folium.CircleMarker(
        [point_lat, point_lng],
        radius=7,        # size of the circle marker
        color='yellow',  # location point colour
    )
    incidents.add_child(circle)
# Clickable name labels on top of the circles.
for point_lat, point_lng, hotel_name in zip(location['lat'].tolist(),
                                            location['lng'].tolist(),
                                            location['Hotel_Name'].tolist()):
    folium.Marker([point_lat, point_lng], popup=hotel_name).add_to(hotel_map)
# Render the circle layer on the map.
hotel_map.add_child(incidents)
# Create new columns recording the checkout year and month separately.
# pd.to_datetime parses the whole column vectorised (instead of a Python-level
# strptime call per row) and the .dt accessor extracts year/month the same way.
df['Review_Date_YMD'] = pd.to_datetime(df['Review_Date'], format='%m/%d/%Y')
df['Review_Date_Y'] = df['Review_Date_YMD'].dt.year
df['Review_Date_M'] = df['Review_Date_YMD'].dt.month
df.head()
# Daily checkout counts: group reviews by checkout date.
# agg(['count']) replaces the original set form agg({'count'}) — sets are
# unordered and the set form is deprecated in newer pandas.
checkout = (df.groupby('Review_Date_YMD')['Review_Date_YMD']
            .agg(['count'])
            .reset_index()
            .sort_values(by = 'Review_Date_YMD'))
# Split by calendar year.
checkout_2015 = checkout[checkout['Review_Date_YMD'] < '2016-01-01']
checkout_2016 = checkout[(checkout['Review_Date_YMD'] < '2017-01-01') & (checkout['Review_Date_YMD'] > '2015-12-31')]
checkout_2017 = checkout[checkout['Review_Date_YMD'] > '2016-12-31']
checkout.to_csv('data/checkout.csv', index = False)
# Plot the daily checkout trend, one panel per year (2015, 2016, 2017);
# a loop replaces the original triplicated plot/legend statements.
fig, ax = plt.subplots(3, 1, figsize = (20,15))
for panel, year_data in zip(ax, (checkout_2015, checkout_2016, checkout_2017)):
    panel.plot(year_data['Review_Date_YMD'], year_data['count'], label="count")
    panel.legend()
plt.show()
Based on these three line charts, we can see that each month has several checkout peaks; the period between a trough and the following peak should be nearly equal to travelers' length of stay.
# Count each reviewer nationality and chart the most common ones.
country = frequency('Reviewer_Nationality')
country.to_csv('data/country.csv', index = False)
country.head()
# top 30 nationalities only, to keep the chart readable
country = country[:30]
bar_chart(country['Reviewer_Nationality'], country['count'], (16, 12), 'Nationality of Reviewers', 'Counts', 90)
The United Kingdom has the highest traveler count — several times that of the other countries.
# Mean score per hotel, best first. The string 'mean' replaces np.average:
# passing raw numpy callables to DataFrame.agg is deprecated in recent pandas,
# and without weights np.average is exactly the arithmetic mean.
avg_score = df.groupby('Hotel_Name').agg({'Average_Score': 'mean'}).reset_index().sort_values(by = 'Average_Score', ascending = False)
avg_score.to_csv('data/avg_score.csv', index = False)
print("The hotel with the highest average score is: " + avg_score[:1]['Hotel_Name'].values.tolist()[0] + ", " + str(avg_score[:1]['Average_Score'].values.tolist()[0]) + '.')
print("The hotel with the lowest average score is: " + avg_score[-1:]['Hotel_Name'].values.tolist()[0] + ", " + str(avg_score[-1:]['Average_Score'].values.tolist()[0]) + '.')
bar_chart(avg_score['Hotel_Name'][:20], avg_score['Average_Score'][:20], (16,10), 'Top 20 Rated Hotels', 'Score', 90)
def hotel_level(score):
    """Map an average score to a hotel quality level.

    Arguments
        score: hotel average score.
    Returns
        'bad' for scores below 7, 'normal' for scores in [7, 8),
        'good' otherwise.
    """
    # Walk the level thresholds in ascending order; the first bound the
    # score falls under determines the label.
    for bound, label in ((7, 'bad'), (8, 'normal')):
        if score < bound:
            return label
    return 'good'
# Bucket each hotel into a quality level from its mean score.
# 'mean' replaces the deprecated np.average callable in agg (identical result
# without weights), and agg(['count']) replaces the unordered-set form.
avg_score = df.groupby('Hotel_Name').agg({'Average_Score': 'mean'}).reset_index()
avg_score['attitude'] = avg_score['Average_Score'].apply(hotel_level)
level = avg_score.groupby('attitude')['attitude'].agg(['count']).reset_index()
level.sort_values(by = 'count', inplace = True)
level.to_csv('data/level.csv', index = False)
level
bar_chart(level['attitude'], level['count'], (8,6), 'Hotel Level Counts', 'Counts', 0)
from wordcloud import WordCloud
def show_wordcloud(data):
    """Render a word cloud for the given text.

    Arguments
        data: input text (anything ``str()`` can stringify, e.g. a Series).
    Returns
        None; displays the word-cloud image.
    """
    cloud = WordCloud(background_color='white',
                      max_words=200,
                      max_font_size=40,
                      scale=3,
                      random_state=42)
    # generate() returns the fitted WordCloud itself, which imshow can render.
    rendered = cloud.generate(str(data))
    fig = plt.figure(1, figsize=(20, 20))
    plt.axis('off')
    plt.imshow(rendered)
    plt.show()
# Replace all 'No Negative' / 'No Positive' placeholders with '', then merge
# the negative and positive review texts into one column.
# .copy() is required: the original called .replace(..., inplace=True) on
# columns of a df slice, which triggers SettingWithCopyWarning and may not
# actually modify the data; plain column assignment is unambiguous.
review = df[['Hotel_Name', 'Reviewer_Score', 'Negative_Review', 'Positive_Review']].copy()
review['Negative_Review'] = review['Negative_Review'].replace('No Negative', '')
review['Positive_Review'] = review['Positive_Review'].replace('No Positive', '')
review['Review'] = review['Negative_Review'] + review['Positive_Review']
# classify the review based on reviewer score, 1 is negative, 0 is positive
review['is_neg'] = review['Reviewer_Score'].apply(lambda x: 1 if x < 5 else 0)
review.head()
def get_wordnet_pos(pos_tag):
    """Translate a Penn Treebank POS tag into a WordNet POS constant.

    Arguments
        pos_tag: Penn Treebank tag string (e.g. 'JJ', 'VBD', 'NNS', 'RB').
    Returns
        The matching wordnet POS constant; anything that is not an
        adjective, verb, noun or adverb tag falls back to NOUN.
    """
    # Treebank tags encode the coarse class in their first letter, so a
    # one-character lookup table replaces the startswith() chain.
    tag_map = {'J': wordnet.ADJ,
               'V': wordnet.VERB,
               'N': wordnet.NOUN,
               'R': wordnet.ADV}
    return tag_map.get(pos_tag[:1], wordnet.NOUN)
def clean_text(text):
    """Normalise a raw review string for modelling.

    Pipeline: lowercase -> strip surrounding punctuation -> drop tokens
    containing digits -> drop stopwords -> drop empty tokens -> POS-tag ->
    lemmatize -> drop single-letter tokens -> re-join with spaces.

    Arguments
        text: the raw review text.
    Returns
        The cleaned, space-joined text.
    """
    # lower
    text = text.lower()
    # tokenize on spaces and strip surrounding punctuation
    tokens = [word.strip(string.punctuation) for word in text.split(" ")]
    # remove words that contain numbers
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]
    # remove stopwords; a set makes each membership test O(1) instead of
    # scanning the stopword list once per token
    stop = set(stopwords.words('english'))
    tokens = [w for w in tokens if w not in stop]
    # remove empty tokens
    tokens = [w for w in tokens if len(w) > 0]
    # pos tag text
    pos_tags = pos_tag(tokens)
    # lemmatize with ONE shared lemmatizer instance (the original constructed
    # a fresh WordNetLemmatizer for every token)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
    # remove words with only one letter
    tokens = [w for w in tokens if len(w) > 1]
    # join all by space
    return " ".join(tokens)
# Clean every merged review for modelling.
review['clean'] = review['Review'].apply(clean_text)
review.head()
# VADER sentiment scores: each review gains neg, neu, pos and compound columns.
sid = SentimentIntensityAnalyzer()
review['sentiments'] = review['Review'].apply(sid.polarity_scores)
score_columns = review['sentiments'].apply(pd.Series)
review = pd.concat([review.drop(columns=['sentiments']), score_columns], axis=1)
review.head()
# add a word count column (tokens split on single spaces)
review['word_counts'] = review['Review'].apply(lambda x: len(x.split(" ")))
review.to_csv('data/sid_review.csv', index = False)
# highest positive sentiment reviews (with at least 5 words)
review[review['word_counts'] >= 5].sort_values('pos', ascending = False)[['Review', 'pos']].head(20)
show_wordcloud(review[review['word_counts'] >= 5].sort_values('pos', ascending = False)['Review'].head(50))
# highest negative sentiment reviews (with at least 5 words)
review[review['word_counts'] >= 5].sort_values('neg', ascending = False)[['Review', 'neg']].head(20)
show_wordcloud(review[review['word_counts'] >= 5].sort_values('neg', ascending = False)['Review'].head(50))
# Convert to the final training frame: 0 means negative, 1 means positive.
# .copy() is required: the original renamed and mutated a slice of `review`
# in place, which triggers SettingWithCopyWarning.
sentiment = review[['clean', 'is_neg']].copy()
sentiment = sentiment.rename(columns = {'clean': 'review', 'is_neg': 'sentiment'})
# is_neg == 1 (bad review) -> label 0 (negative); is_neg == 0 -> 1 (positive)
sentiment['sentiment'] = sentiment['sentiment'].apply(lambda x: 0 if x > 0 else 1)
sentiment.to_csv('data/sentiment.csv', index = False)
sentiment.head()
# Import the modelling helpers and build the classification pipeline.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
# Single-step pipeline: an L2-regularized logistic-regression classifier.
classifier = LogisticRegression(penalty = 'l2', max_iter = 500, C = 1, random_state = 42)
lr_model = Pipeline(steps = [('lr', classifier)])
# Split the dataset: 80% training, 20% testing.
X_train, X_test, y_train, y_test = train_test_split(sentiment['review'], sentiment['sentiment'], test_size = 0.2, random_state = 42)
# Bag-of-words features over unigrams..trigrams.
# BUG FIX: max_df=1 is an *integer*, which sklearn reads as "keep only terms
# appearing in at most ONE document" — that discards almost the entire
# vocabulary. The intended value is the proportion 1.0 (no upper
# document-frequency cut-off).
cv = CountVectorizer(min_df = 0, max_df = 1.0, binary = False, ngram_range = (1, 3))
cv_train_reviews = cv.fit_transform(X_train)
cv_test_reviews = cv.transform(X_test)
print('Bag of Words cv train:', cv_train_reviews.shape)
print('Bag of Words cv test:', cv_test_reviews.shape)
# Tf-idf features over the same unigram..trigram range.
# BUG FIX: as with the CountVectorizer above, max_df=1 (int) keeps only terms
# appearing in at most one document; the intended proportion is 1.0.
tv = TfidfVectorizer(min_df = 0, max_df = 1.0, use_idf = True, ngram_range = (1, 3))
tv_train_reviews = tv.fit_transform(X_train)
tv_test_reviews = tv.transform(X_test)
print('Tfidf_train:', tv_train_reviews.shape)
print('Tfidf_test:', tv_test_reviews.shape)
# Train one logistic-regression pipeline per feature set.
# BUG FIX: the original fit the SAME lr_model object on bag-of-words and then
# on tfidf, so by the time lr_model.predict(cv_test_reviews) ran, the model
# had been overwritten by the tfidf fit — the bag-of-words predictions used
# the wrong (tfidf-trained) model. clone() gives each feature set an
# independent, unfitted copy of the pipeline.
from sklearn.base import clone
# fitting for bag of words
lr_bow = clone(lr_model).fit(cv_train_reviews, y_train)
print(lr_bow)
# fitting for tfidf
lr_tfidf = clone(lr_model).fit(tv_train_reviews, y_train)
print(lr_tfidf)
# predict with the matching model for each feature set
lr_bow_pred = lr_bow.predict(cv_test_reviews)
print(lr_bow_pred)
lr_tfidf_pred = lr_tfidf.predict(tv_test_reviews)
print(lr_tfidf_pred)
# Accuracy, per-class report and confusion matrix for both feature sets.
from sklearn import metrics
lr_bow_score = metrics.accuracy_score(y_test, lr_bow_pred)
print("lr_bow_score:", lr_bow_score)
lr_tfidf_score = metrics.accuracy_score(y_test, lr_tfidf_pred)
print("lr_tfidf_score:", lr_tfidf_score)
# BUG FIX: target_names maps positionally onto the sorted labels [0, 1].
# In this dataset 0 means negative and 1 means positive (see the sentiment
# conversion step), so the original ['Positive', 'Negative'] order mislabelled
# both classes in the reports.
lr_bow_report = classification_report(y_test, lr_bow_pred, target_names = ['Negative', 'Positive'])
print(lr_bow_report)
lr_tfidf_report = classification_report(y_test, lr_tfidf_pred, target_names = ['Negative', 'Positive'])
print(lr_tfidf_report)
# confusion matrices with the positive class (1) listed first
cm_bow = confusion_matrix(y_test, lr_bow_pred, labels = [1, 0])
print(cm_bow)
cm_tfidf = confusion_matrix(y_test, lr_tfidf_pred, labels = [1, 0])
print(cm_tfidf)
Based on the model evaluation, the logistic regression model performs well, and the difference between using bag of words and tf-idf is not significant.